MNIST using TensorFlow


In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import time

Load data:


In [2]:
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')
train_raw.info()
test_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 167.5 MB

In [3]:
x_train = np.array(train_raw.drop(['label'], axis=1)) # Each row is a data point
# Standardization (left disabled):
#x_train_mean = np.mean(x_train, axis=0)
#x_train_std = np.std(x_train, axis=0)
#x_train = (x_train - x_train_mean) / x_train_std
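# A simpler alternative (not used in this run) is to scale pixels to [0, 1];
# the standardization above would divide by zero for any pixel column that is
# constant across the training set:
#x_train = x_train / 255.0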

In [4]:
y_train = np.array(train_raw['label']).reshape(-1)
y_train = np.eye(10)[y_train] # Make it one hot

In [5]:
# Configure multithreading:
config = tf.ConfigProto(device_count={"CPU": 4}, # use 4 CPU cores
                inter_op_parallelism_threads = 4, # threads for running independent ops in parallel
                intra_op_parallelism_threads = 32, # threads used within a single op
                log_device_placement=True)

Logistic Regression:


In [6]:
# Note: in TensorFlow each example is a row of x, so the model is y = xW + b
x = tf.placeholder(tf.float32, shape=[None, 784]) # None lets the batch dimension take any size
y_ = tf.placeholder(tf.float32, shape=[None, 10])
W = tf.Variable(tf.zeros([784,10]), dtype=tf.float32)
b = tf.Variable(tf.zeros([10]), dtype=tf.float32)
y = tf.matmul(x, W) + b

In [7]:
# Loss = softmax cross-entropy + L2 regularization on W (tf.nn.l2_loss returns sum(w**2) / 2)
l2_reg_logi = tf.nn.l2_loss(W)
cross_entropy_logi = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
loss_logi = cross_entropy_logi + 0.01 * l2_reg_logi
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss_logi)
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
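
For intuition only, the softmax cross-entropy above can also be written out by hand; a rough sketch (numerically less stable than the built-in op, and not used for training):


In [ ]:
probs = tf.nn.softmax(y)
manual_cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(probs), axis=1))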

In [8]:
# Train for 10 epochs, updating on mini-batches of 100 training examples
run_time = time.time()
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer()) # Initialization!
    train_size = x_train.shape[0]
    for epoch in range(10):
        i = 0
        while i < train_size:
            if i + 100 < train_size:
                train_step.run(feed_dict={x: x_train[i:i+100, :], y_: y_train[i:i+100]})
            else:
                train_step.run(feed_dict={x: x_train[i:train_size, :], y_: y_train[i:train_size]})
            i += 100 # batch size = 100
    print('Logistic Regression training accuracy: ', accuracy.eval(feed_dict={x: x_train, y_: y_train}))
run_time = time.time() - run_time
print('Logistic Regression total running time: ', run_time)


Logistic Regression training accuracy:  0.872905
Logistic Regression total running time:  3.8324930667877197

Convolutional Neural Network:


In [9]:
# We use a CNN with 3 convolutional layers, each followed by a max-pooling layer, then a fully connected layer
# First conv layer (3 * 3 * 32)
x_image = tf.reshape(x, [-1, 28, 28, 1]) # Reshape x to a rank-4 tensor: (# images) x 28 H x 28 W x 1 channel
                                         # Note: -1 means that axis's size is inferred to keep the total number of elements fixed
W_conv1 = tf.Variable(tf.truncated_normal([3, 3, 1, 32], stddev=0.1)) # Small random init to break symmetry
b_conv1 = tf.Variable(tf.truncated_normal([32], stddev=0.1))
h_conv1 = tf.nn.relu(tf.nn.conv2d(x_image, W_conv1, strides=[1, 1, 1, 1], padding='SAME') + b_conv1)
# First max pooling layer (2 * 2) with stride 2
h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') # 2x2 pooling with stride 2 halves H and W

In [10]:
# Second conv layer (3 * 3 * 64)
W_conv2 = tf.Variable(tf.truncated_normal([3, 3, 32, 64], stddev=0.1))
b_conv2 = tf.Variable(tf.truncated_normal([64], stddev=0.1))
h_conv2 = tf.nn.relu(tf.nn.conv2d(h_pool1, W_conv2, strides=[1, 1, 1, 1], padding='SAME') + b_conv2)
# Second max pooling layer (2 * 2) with stride 2
h_pool2 = tf.nn.max_pool(h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

In [11]:
# Third conv layer (5 * 5 * 128)
W_conv3 = tf.Variable(tf.truncated_normal([5, 5, 64, 128], stddev=0.1))
b_conv3 = tf.Variable(tf.truncated_normal([128], stddev=0.1))
h_conv3 = tf.nn.relu(tf.nn.conv2d(h_pool2, W_conv3, strides=[1, 1, 1, 1], padding='SAME') + b_conv3)
# Third max pooling layer (2 * 2) with stride 1 (SAME padding keeps the 7 x 7 spatial size)
h_pool3 = tf.nn.max_pool(h_conv3, ksize=[1, 2, 2, 1], strides=[1, 1, 1, 1], padding='SAME')
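
The flatten size used in the next cell (7 * 7 * 128) follows from the pooling: 28 x 28 becomes 14 x 14 after the first pool, 7 x 7 after the second, and stays 7 x 7 after the third (stride 1, SAME padding). An optional sanity check of the shapes:


In [ ]:
print(h_pool1.shape)  # expected (?, 14, 14, 32)
print(h_pool2.shape)  # expected (?, 7, 7, 64)
print(h_pool3.shape)  # expected (?, 7, 7, 128)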

In [12]:
# Fully connected layer with 1024 neurons
W_fc1 = tf.Variable(tf.truncated_normal([7 * 7 * 128, 1024], stddev=0.1))
b_fc1 = tf.Variable(tf.truncated_normal([1024], stddev=0.1))
h_pool3_flat = tf.reshape(h_pool3, [-1, 7 * 7 * 128])
h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1)

In [13]:
# Dropout to reduce overfitting
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

In [14]:
# Final output layer (logits only; softmax is applied inside the loss function)
W_fc2 = tf.Variable(tf.truncated_normal([1024, 10], stddev=0.1))
b_fc2 = tf.Variable(tf.truncated_normal([10], stddev=0.1))
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

In [15]:
# Loss: softmax cross-entropy + L2 weight decay (the Adam optimizer is set up in the next cell)
cross_entropy_cnn = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
l2_reg_cnn = tf.nn.l2_loss(W_conv1) + tf.nn.l2_loss(W_conv2) + tf.nn.l2_loss(W_fc1) + tf.nn.l2_loss(W_fc2) # Only decay the weight matrices (note: W_conv3 is not included)
loss_cnn = cross_entropy_cnn + 0.01 * l2_reg_cnn

In [16]:
# A training step:
train_step = tf.train.AdamOptimizer(1e-4).minimize(loss_cnn) # Returns a tf.Operation
# Define accuracy:
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [17]:
saver = tf.train.Saver() 
run_time = time.time()
# Run model session:
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer()) # Initialization!
    train_size = x_train.shape[0]
    for epoch in range(1, 201):
        i = 0
        while i < train_size:
            if i + 100 < train_size:
                train_step.run(feed_dict={x: x_train[i:i+100, :], y_: y_train[i:i+100], keep_prob: 0.6})
            else:
                train_step.run(feed_dict={x: x_train[i:train_size, :], y_: y_train[i:train_size], keep_prob: 0.6})
            i += 100 # batch size = 100
        if epoch % 10 == 0:
            train_accuracy = accuracy.eval(feed_dict={x: x_train, y_: y_train, keep_prob: 1.0})
            print('Epoch ', epoch, 'training accuracy ', train_accuracy)
    # Save the final model:
    saver.save(sess, 'saved_model.ckpt') 
run_time = time.time() - run_time
print('CNN total running time: ', run_time)


Epoch  10 training accuracy  0.989262
Epoch  20 training accuracy  0.994167
Epoch  30 training accuracy  0.997357
Epoch  40 training accuracy  0.995809
Epoch  50 training accuracy  0.999381
Epoch  60 training accuracy  0.999071
Epoch  70 training accuracy  0.998929
Epoch  80 training accuracy  0.998595
Epoch  90 training accuracy  0.999405
Epoch  100 training accuracy  0.999119
Epoch  110 training accuracy  0.997976
Epoch  120 training accuracy  0.99931
Epoch  130 training accuracy  0.999333
Epoch  140 training accuracy  0.99981
Epoch  150 training accuracy  0.999286
Epoch  160 training accuracy  0.999548
Epoch  170 training accuracy  0.999667
Epoch  180 training accuracy  0.999857
Epoch  190 training accuracy  0.999952
Epoch  200 training accuracy  0.999905
CNN total running time:  17849.82430076599

Time for predictions!


In [18]:
x_test = np.array(test_raw)
with tf.Session(config=config) as sess:
    # Load saved model:
    saver.restore(sess, 'saved_model.ckpt')
    # Run sess using loaded model to make prediction:
    y_pred = sess.run(tf.nn.softmax(logits=y_conv), feed_dict={x: x_test, keep_prob: 1.0})
print(y_pred[0:3, :])


INFO:tensorflow:Restoring parameters from saved_model.ckpt
[[  2.23678825e-10   1.90756022e-10   1.00000000e+00   1.65946568e-09
    1.48918114e-10   5.00106412e-13   3.86252419e-10   2.90928115e-10
    2.62108751e-10   3.50075589e-12]
 [  9.99980688e-01   1.61723085e-06   4.06303161e-06   8.55084394e-08
    3.26068772e-07   1.59932267e-06   2.92567711e-06   2.73764158e-06
    1.10794633e-06   4.70216355e-06]
 [  9.79251399e-06   4.66945539e-06   6.65620746e-06   1.15532666e-05
    2.21098715e-04   1.17985965e-05   2.10299390e-07   7.41176837e-06
    4.41391821e-05   9.99682784e-01]]

In [19]:
y_output = np.argmax(y_pred, axis=1)
print(y_output)


[2 0 9 ..., 3 9 2]

In [20]:
output_label = {'ImageId': range(1, y_output.shape[0]+1)}
predictions = pd.DataFrame(output_label)
predictions['Label'] = pd.Series(y_output)
predictions.to_csv('predictions.csv', index=False)
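
An optional sanity check of the submission file (assuming the expected columns are ImageId and Label):


In [ ]:
print(pd.read_csv('predictions.csv').head())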
